import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize':(10,8)})
import jpype
from jpype import JArray, JDouble
def validate_numpy_array(func):
    """Decorator: convert sized iterable positional args to numpy arrays.

    Any positional argument that supports len() is replaced with
    np.asarray(arg) before the wrapped function is called; everything
    else (scalars, `self`, ...) passes through unchanged.

    Note: the original implementation rebound the loop variable
    (`item = np.array(item)`) and then called func with the untouched
    `args`, so no conversion ever reached the wrapped function.
    """
    def wrapper(*args, **kwargs):
        converted = []
        for item in args:
            try:
                len(item)  # raises TypeError for non-sized objects
            except TypeError:
                converted.append(item)
            else:
                converted.append(np.asarray(item))
        return func(*converted, **kwargs)
    return wrapper
class ContinuousInfoEstimator:
    """Thin wrapper around JIDT's continuous information-theoretic
    estimators (transfer entropy, active information storage) accessed
    through JPype.

    Parameters
    ----------
    jar_location : str, optional
        Path to infodynamics.jar; defaults to a hard-coded location under $HOME.
    estimator : str
        JIDT estimator family: 'kraskov' (k-NN based) or 'gaussian'.
    """

    def __init__(self, jar_location=None, estimator='kraskov'):
        self.estimator = estimator
        if not jar_location:
            project_dir = 'Dropbox/Documents/Classes/InfoTheory'
            jidt_dir = os.path.join(os.environ['HOME'], project_dir)
            jar_location = os.path.join(jidt_dir, 'infodynamics-dist-1.5/infodynamics.jar')
        assert os.path.exists(jar_location), 'jar file not found: ' + jar_location
        # Start the JVM (add the "-Xmx" option with say 1024M if you get
        # crashes due to not enough memory space)
        try:
            # Guard: JPype raises if the JVM is started twice, which previously
            # made a second instantiation fail and leave self.pkg unset.
            if not jpype.isJVMStarted():
                jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=" + jar_location)
            # load the estimator package, e.g. infodynamics.measures.continuous.kraskov
            self.pkg = jpype.JPackage(f"infodynamics.measures.continuous.{estimator}")
            print('Estimator ready.')
        except OSError:
            print("Failed to start JVM. Check $JAVA_HOME environmental var.")

    def list_calculators(self):
        """Return the public calculator class names exposed by the loaded package."""
        return [item for item in dir(self.pkg) if item[0].isupper() and item[0].isalpha()]

    @validate_numpy_array
    def transfer_entropy(self, src, dest, k=1, nearest_neighbors=4, normalize=True, auto_embed=False):
        """Average transfer entropy src -> dest in nats.

        k is the destination history length; nearest_neighbors is the
        Kraskov K parameter. Raises ValueError for an unsupported
        estimator (previously this fell through to a NameError).
        """
        src = np.asarray(src)
        dest = np.asarray(dest)
        assert len(src) == len(dest), f'Length mismatch: {len(src)}, {len(dest)}'
        if self.estimator == 'gaussian':
            calc = self.pkg.TransferEntropyCalculatorGaussian()
        elif self.estimator == 'kraskov':
            calc = self.pkg.TransferEntropyCalculatorKraskov()
        else:
            raise ValueError(f'Unsupported estimator: {self.estimator}')
        if auto_embed:
            calc.setProperty("AUTO_EMBED_METHOD", "MAX_CORR_AIS_DEST_ONLY")
        else:
            normalize_var = 'true' if normalize else 'false'
            calc.setProperty("NORMALISE", normalize_var)  # Normalise the individual variables
        calc.setProperty("k", f"{nearest_neighbors}")  # Kraskov parameter: number of nearest points
        calc.initialise(k)  # Use history length k
        src_arr = JArray(JDouble, 1)(src.tolist())
        dest_arr = JArray(JDouble, 1)(dest.tolist())
        calc.setObservations(src_arr, dest_arr)
        return calc.computeAverageLocalOfObservations()

    @validate_numpy_array
    def active_information_storage(self, arr, k=3):
        """Average active information storage of arr (history length k), in nats."""
        calc = self.pkg.ActiveInfoStorageCalculatorKraskov()
        # 2. Set any properties to non-default values:
        calc.setProperty("k_HISTORY", f"{k}")
        calc.initialise()
        # convert to a Java double[] for consistency with transfer_entropy
        calc.setObservations(JArray(JDouble, 1)(np.asarray(arr).tolist()))
        return calc.computeAverageLocalOfObservations()
# utility functions
def pct_change(df):
    """Simple (arithmetic) return of the adjusted close: (p_t - p_{t-1}) / p_{t-1}."""
    prev = df['adj_close'].shift(1)
    return (df['adj_close'] - prev) / prev
def daily_return(df):
    """Log return of the adjusted close: ln(p_t) - ln(p_{t-1})."""
    log_close = np.log(df['adj_close'])
    return log_close - log_close.shift(1)
def compute_std(df):
    """Sample standard deviation (ddof=1) of the daily_return column."""
    deviations = df['daily_return'] - df['daily_return'].mean()
    return np.sqrt((deviations ** 2).sum() / (len(df) - 1))
# Lazily construct the estimator: reuse an existing `calc` if this cell
# was already run (the JVM can only be started once per process).
estimator = 'kraskov'
try:
    options = calc.list_calculators()
except NameError:
    calc = ContinuousInfoEstimator(estimator=estimator)
    options = calc.list_calculators()
for name in options:
    print(name)
Estimator ready. ActiveInfoStorageCalculatorKraskov ActiveInfoStorageCalculatorMultiVariateKraskov ConditionalMutualInfoCalculatorMultiVariateKraskov ConditionalMutualInfoCalculatorMultiVariateKraskov1 ConditionalMutualInfoCalculatorMultiVariateKraskov2 ConditionalTransferEntropyCalculatorKraskov MultiInfoCalculatorKraskov MultiInfoCalculatorKraskov1 MultiInfoCalculatorKraskov2 MutualInfoCalculatorMultiVariateKraskov MutualInfoCalculatorMultiVariateKraskov1 MutualInfoCalculatorMultiVariateKraskov2 PredictiveInfoCalculatorKraskov TransferEntropyCalculatorKraskov TransferEntropyCalculatorMultiVariateKraskov
from scipy.special import digamma
from sklearn.neighbors import KDTree
def entropy(x, k=4, base=2):
    """Classic Kozachenko-Leonenko k-nearest-neighbour continuous entropy
    estimator.

    x is a 2-D sample: one row per observation, e.g.
    x = [[1.3], [3.7], [5.1], [2.4]] for four scalar samples.
    """
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    samples = np.asarray(x)
    n_samples, n_dims = samples.shape
    samples = add_noise(samples)  # break ties between identical samples
    knn_dist = query_neighbors(build_tree(samples), samples, k)
    # bias-correction constant; n_dims * log(2) comes from the chebyshev ball volume
    const = digamma(n_samples) - digamma(k) + n_dims * np.log(2)
    return (const + n_dims * np.log(knn_dist).mean()) / np.log(base)
def add_noise(x, intens=1e-10):
    """Add tiny uniform noise to break degeneracy among identical samples."""
    noise = np.random.random_sample(x.shape) * intens
    return x + noise
def query_neighbors(tree, x, k):
    """Distance from each point to its k-th nearest neighbour.

    Queries k+1 neighbours because the closest hit is the point itself
    (column 0), so column k holds the k-th true neighbour distance.
    """
    distances = tree.query(x, k=k + 1)[0]
    return distances[:, k]
def build_tree(points):
    """Build a chebyshev-metric nearest-neighbour tree for the samples.

    KDTree degrades in high dimensions, so fall back to BallTree when
    there are >= 20 features.
    """
    if points.shape[1] >= 20:
        # BallTree was never imported at file level (only KDTree), so this
        # branch previously raised NameError; import it locally here.
        from sklearn.neighbors import BallTree
        return BallTree(points, metric='chebyshev')
    return KDTree(points, metric='chebyshev')
# Load each index CSV ('^GSPC.csv', ...) keyed by ticker without the caret.
dfs = {}
for fname in os.listdir('data/index'):
    if fname.startswith('^'):
        ticker, _ = os.path.splitext(fname)
        dfs[ticker[1:]] = pd.read_csv(os.path.join('data/index', fname))
dfs.keys()
dict_keys(['N225', 'GSPC', 'N100', 'BVSP'])
# Normalise column names, parse dates, then eyeball each index's date range.
for frame in dfs.values():
    frame.columns = [c.lower().replace(' ', '_') for c in frame.columns]
    frame['date'] = pd.to_datetime(frame['date'])
for name, frame in dfs.items():
    print(name)
    # NOTE(review): datetime_is_numeric was removed in pandas 2.0 (its behavior became the default)
    print(frame['date'].describe(datetime_is_numeric=True))
    print('')
N225 count 14356 mean 1992-08-16 21:16:48.080245120 min 1965-01-06 00:00:00 25% 1978-10-09 18:00:00 50% 1992-07-11 12:00:00 75% 2006-04-13 06:00:00 max 2020-10-29 00:00:00 Name: date, dtype: object GSPC count 23317 mean 1974-07-03 11:10:15.259252912 min 1928-01-03 00:00:00 25% 1951-05-04 00:00:00 50% 1974-08-09 00:00:00 75% 1997-08-29 00:00:00 max 2020-10-28 00:00:00 Name: date, dtype: object N100 count 5357 mean 2010-05-17 12:58:27.933544960 min 2000-01-03 00:00:00 25% 2005-02-18 00:00:00 50% 2010-05-13 00:00:00 75% 2015-08-07 00:00:00 max 2020-10-30 00:00:00 Name: date, dtype: object BVSP count 6984 mean 2006-11-27 21:07:50.103092992 min 1993-04-28 00:00:00 25% 2000-01-05 18:00:00 50% 2006-09-30 12:00:00 75% 2013-10-17 06:00:00 max 2020-10-28 00:00:00 Name: date, dtype: object
# Switch from prices to log returns; simple pct_change kept for reference.
for frame in dfs.values():
    # frame['pct_change'] = pct_change(frame)
    frame['daily_return'] = daily_return(frame)
dfs['GSPC'].head()
| date | open | high | low | close | adj_close | volume | daily_return | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1928-01-03 | 17.760000 | 17.760000 | 17.760000 | 17.760000 | 17.760000 | 0 | NaN |
| 1 | 1928-01-04 | 17.719999 | 17.719999 | 17.719999 | 17.719999 | 17.719999 | 0 | -0.002255 |
| 2 | 1928-01-05 | 17.549999 | 17.549999 | 17.549999 | 17.549999 | 17.549999 | 0 | -0.009640 |
| 3 | 1928-01-06 | 17.660000 | 17.660000 | 17.660000 | 17.660000 | 17.660000 | 0 | 0.006248 |
| 4 | 1928-01-09 | 17.500000 | 17.500000 | 17.500000 | 17.500000 | 17.500000 | 0 | -0.009101 |
from functools import reduce

# Keep only date + return per index (renamed to the ticker), then inner-join
# all indices on shared trading dates.
cols = ['open', 'high', 'low', 'close', 'adj_close', 'volume']
for key in dfs.keys():
    dfs[key].rename(columns={'daily_return': key}, inplace=True)
    # keyword `columns=`: the positional axis argument was removed in pandas 2.0
    dfs[key].drop(columns=cols, inplace=True)
f = lambda left, right: pd.merge(left, right, on='date', how='inner')
data = reduce(f, dfs.values())
data.head()
| date | N225 | GSPC | N100 | BVSP | |
|---|---|---|---|---|---|
| 0 | 2000-01-03 | NaN | -0.009595 | NaN | NaN |
| 1 | 2000-01-04 | NaN | -0.039099 | -0.041794 | -0.065855 |
| 2 | 2000-01-05 | -0.024521 | 0.001920 | -0.027262 | 0.024553 |
| 3 | 2000-01-06 | -0.020391 | 0.000955 | -0.008420 | -0.008531 |
| 4 | 2000-01-07 | 0.001383 | 0.026730 | 0.022955 | 0.012463 |
# count missing returns per index after the inner join on dates
data.isnull().sum()
date 0 N225 205 GSPC 0 N100 36 BVSP 143 dtype: int64
# Forward-fill gaps of at most 4 trading days (market-holiday mismatches).
# .ffill(limit=4) is equivalent to fillna(method='ffill', limit=4), which
# pandas deprecated in 2.1 and removed in 3.0.
data = data.ffill(limit=4)
data.head()
| date | N225 | GSPC | N100 | BVSP | |
|---|---|---|---|---|---|
| 0 | 2000-01-03 | NaN | -0.009595 | NaN | NaN |
| 1 | 2000-01-04 | NaN | -0.039099 | -0.041794 | -0.065855 |
| 2 | 2000-01-05 | -0.024521 | 0.001920 | -0.027262 | 0.024553 |
| 3 | 2000-01-06 | -0.020391 | 0.000955 | -0.008420 | -0.008531 |
| 4 | 2000-01-07 | 0.001383 | 0.026730 | 0.022955 | 0.012463 |
# remaining NaNs after the limited forward fill
data.isnull().sum()
date 0 N225 6 GSPC 0 N100 1 BVSP 1 dtype: int64
# drop the handful of rows that still have gaps
data.dropna(inplace=True)
data
| date | N225 | GSPC | N100 | BVSP | |
|---|---|---|---|---|---|
| 2 | 2000-01-05 | -0.024521 | 0.001920 | -0.027262 | 0.024553 |
| 3 | 2000-01-06 | -0.020391 | 0.000955 | -0.008420 | -0.008531 |
| 4 | 2000-01-07 | 0.001383 | 0.026730 | 0.022955 | 0.012463 |
| 5 | 2000-01-10 | 0.001383 | 0.011128 | 0.017163 | 0.042790 |
| 6 | 2000-01-11 | 0.001383 | -0.013149 | -0.006436 | -0.026732 |
| ... | ... | ... | ... | ... | ... |
| 4886 | 2020-10-22 | -0.007012 | 0.005205 | -0.001982 | 0.013494 |
| 4887 | 2020-10-23 | 0.001801 | 0.003440 | 0.007600 | -0.006477 |
| 4888 | 2020-10-26 | -0.000947 | -0.018764 | -0.016265 | -0.002403 |
| 4889 | 2020-10-27 | -0.000364 | -0.003030 | -0.010426 | -0.014066 |
| 4890 | 2020-10-28 | -0.002869 | -0.035926 | -0.029471 | -0.043469 |
4885 rows × 5 columns
# summary statistics of the daily log returns per index
data.describe()
| N225 | GSPC | N100 | BVSP | |
|---|---|---|---|---|
| count | 4885.000000 | 4885.000000 | 4885.000000 | 4885.000000 |
| mean | -0.000054 | 0.000154 | 0.000019 | 0.000352 |
| std | 0.014694 | 0.012465 | 0.013142 | 0.017945 |
| min | -0.121110 | -0.127652 | -0.127517 | -0.159930 |
| 25% | -0.007039 | -0.004813 | -0.005979 | -0.009430 |
| 50% | 0.000278 | 0.000588 | 0.000604 | 0.000584 |
| 75% | 0.007945 | 0.005805 | 0.006518 | 0.010869 |
| max | 0.132346 | 0.102457 | 0.084688 | 0.130223 |
# long format for plotly faceting (one row per date/index/value)
df = pd.melt(data, id_vars=['date'])
px.line(df, x='date', y='value', color='variable', facet_row='variable', title='Daily return over time', height=1000, width=1000)
px.histogram(df, x='value', color='variable', facet_row='variable', width=1000, height=1000, title='Distribution of daily return')
data.isnull().sum()
date 0 N225 0 GSPC 0 N100 0 BVSP 0 dtype: int64
# no-op: NaNs were already dropped above
data.dropna(inplace=True)
# Differential entropy (in nats, base=e) of each index's return series.
print('entropy')
for col in data.columns:
    if col == 'date':
        continue
    h = entropy(data[col].dropna().values.reshape(-1, 1), base=math.e)
    print(f"{col} : {h:.2f}")
entropy N225 : -3.01 GSPC : -3.12 N100 : -3.05 BVSP : -2.70
# restrict to the GSPC/N225 pair and cut off at the end of 2020-09
select_data = data[['date', 'GSPC', 'N225']]
select_data = select_data[select_data['date'] < '2020-10-01']
# NOTE(review): datetime_is_numeric was removed in pandas 2.0 (its behavior became the default)
select_data['date'].describe(datetime_is_numeric=True)
count 4866 mean 2010-03-07 07:41:39.136868096 min 2000-01-05 00:00:00 25% 2004-11-10 06:00:00 50% 2010-01-28 12:00:00 75% 2015-06-17 18:00:00 max 2020-09-30 00:00:00 Name: date, dtype: object
# confirm the pair has no missing values
select_data.isnull().sum()
date 0 GSPC 0 N225 0 dtype: int64
# sanity check: calculators available from the loaded JIDT package
calc.list_calculators()
['ActiveInfoStorageCalculatorKraskov', 'ActiveInfoStorageCalculatorMultiVariateKraskov', 'ConditionalMutualInfoCalculatorMultiVariateKraskov', 'ConditionalMutualInfoCalculatorMultiVariateKraskov1', 'ConditionalMutualInfoCalculatorMultiVariateKraskov2', 'ConditionalTransferEntropyCalculatorKraskov', 'MultiInfoCalculatorKraskov', 'MultiInfoCalculatorKraskov1', 'MultiInfoCalculatorKraskov2', 'MutualInfoCalculatorMultiVariateKraskov', 'MutualInfoCalculatorMultiVariateKraskov1', 'MutualInfoCalculatorMultiVariateKraskov2', 'PredictiveInfoCalculatorKraskov', 'TransferEntropyCalculatorKraskov', 'TransferEntropyCalculatorMultiVariateKraskov']
# Monthly transfer entropy in both directions between GSPC and N225.
# Months with fewer than K+1 observations are skipped.
cols = ['year', 'month', 't_GSPCtoN225', 't_N225toGSPC']
indices = ['GSPC', 'N225']
K = 4
monthly_stats = []
for year in range(1993, 2021):
    for month in range(1, 13):
        mask = (select_data['date'].dt.year == year) & (select_data['date'].dt.month == month)
        subset = select_data[mask]
        if len(subset) > K:
            te_fwd = calc.transfer_entropy(src=subset['GSPC'].values, dest=subset['N225'].values, k=1, nearest_neighbors=K)
            te_rev = calc.transfer_entropy(src=subset['N225'].values, dest=subset['GSPC'].values, k=1, nearest_neighbors=K)
            monthly_stats.append([year, month, te_fwd, te_rev])
monthly = pd.DataFrame(monthly_stats, columns=cols)
# synthesize a month-start timestamp for plotting
monthly['yymmdd'] = pd.to_datetime(monthly['year'].astype(str) + '-' + monthly['month'].astype(str))
monthly.head()
| year | month | t_GSPCtoN225 | t_N225toGSPC | yymmdd | |
|---|---|---|---|---|---|
| 0 | 2000 | 1 | -0.012061 | -0.050865 | 2000-01-01 |
| 1 | 2000 | 2 | 0.075308 | -0.036129 | 2000-02-01 |
| 2 | 2000 | 3 | -0.023637 | -0.055256 | 2000-03-01 |
| 3 | 2000 | 4 | 0.118796 | 0.004676 | 2000-04-01 |
| 4 | 2000 | 5 | 0.175229 | -0.031737 | 2000-05-01 |
# Transfer entropy is non-negative; clip the estimator's negative noise to zero.
f = lambda x: max(0, x)
for col in ('t_GSPCtoN225', 't_N225toGSPC'):
    monthly[col] = monthly[col].apply(f)
monthly.describe()
| year | month | t_GSPCtoN225 | t_N225toGSPC | |
|---|---|---|---|---|
| count | 249.000000 | 249.000000 | 249.000000 | 249.000000 |
| mean | 2009.879518 | 6.445783 | 0.074558 | 0.021942 |
| std | 6.002817 | 3.442803 | 0.075210 | 0.035687 |
| min | 2000.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 2005.000000 | 3.000000 | 0.006179 | 0.000000 |
| 50% | 2010.000000 | 6.000000 | 0.057718 | 0.000000 |
| 75% | 2015.000000 | 9.000000 | 0.116811 | 0.033758 |
| max | 2020.000000 | 12.000000 | 0.331706 | 0.251322 |
# monthly.style.bar(subset=['t_GSPCtoN225', 't_N225toGSPC'], align='left', color='#5fba7d')
# where red area is above blue area is where SPY has more influence on Nikkei than the other way around
px.area(monthly, x='yymmdd', y=['t_GSPCtoN225', 't_N225toGSPC'])
# px.line(monthly, x='yymmdd', y=['t_GSPCtoN225', 't_N225toGSPC'])
# kernel density of the two directional TE estimates
sns.kdeplot(monthly['t_GSPCtoN225'].values, color='blue', label='GSPC -> N225')
sns.kdeplot(monthly['t_N225toGSPC'].values, color='red', label='N225 -> GSPC')
plt.legend()
<matplotlib.legend.Legend at 0x7fd19e4d8d50>
f = lambda x: max(x, 0)
# NOTE(review): both columns were already clipped at zero above, so the
# extra .apply(f) here is a harmless no-op
monthly['net_info_flow'] = monthly['t_GSPCtoN225'].apply(f) - monthly['t_N225toGSPC'].apply(f)
px.bar(monthly, x='yymmdd', y='net_info_flow', title='Net information flow from GSPC to N225')
# net information flow from GSPC -> N225
# monthly[['year', 'month', 'net_info_flow']].style.bar(subset=['net_info_flow'], align='mid', color=['#d65f5f', '#5fba7d'])
# Same computation at yearly granularity (default history length k=1).
cols = ['year', 't_GSPCtoN225', 't_N225toGSPC']
indices = ['GSPC', 'N225']
K = 4
yearly_stats = []
for year in range(1993, 2021):
    subset = select_data[select_data['date'].dt.year == year]
    if len(subset) > K:
        te_fwd = calc.transfer_entropy(src=subset['GSPC'].values, dest=subset['N225'].values, nearest_neighbors=K)
        te_rev = calc.transfer_entropy(src=subset['N225'].values, dest=subset['GSPC'].values, nearest_neighbors=K)
        yearly_stats.append([year, te_fwd, te_rev])
yearly = pd.DataFrame(yearly_stats, columns=cols)
# year-start timestamp for plotting
yearly['yymmdd'] = pd.to_datetime(yearly['year'].astype(str) + '-01-01')
yearly.head()
| year | t_GSPCtoN225 | t_N225toGSPC | yymmdd | |
|---|---|---|---|---|
| 0 | 2000 | 0.073134 | 0.003553 | 2000-01-01 |
| 1 | 2001 | 0.042647 | -0.006569 | 2001-01-01 |
| 2 | 2002 | 0.069011 | 0.008767 | 2002-01-01 |
| 3 | 2003 | 0.089585 | -0.004480 | 2003-01-01 |
| 4 | 2004 | 0.047205 | 0.001261 | 2004-01-01 |
# Clip negative estimator noise to zero (TE is non-negative).
# NB: `f` is reused further below, so it must stay defined here.
f = lambda x: max(0, x)
for col in ('t_GSPCtoN225', 't_N225toGSPC'):
    yearly[col] = yearly[col].apply(f)
yearly.describe()
| year | t_GSPCtoN225 | t_N225toGSPC | |
|---|---|---|---|
| count | 21.000000 | 21.000000 | 21.000000 |
| mean | 2010.000000 | 0.132232 | 0.014940 |
| std | 6.204837 | 0.073015 | 0.019037 |
| min | 2000.000000 | 0.039777 | 0.000000 |
| 25% | 2005.000000 | 0.068499 | 0.000000 |
| 50% | 2010.000000 | 0.125985 | 0.003553 |
| 75% | 2015.000000 | 0.201544 | 0.030833 |
| max | 2020.000000 | 0.269289 | 0.054382 |
px.line(yearly, x='year', y=['t_GSPCtoN225', 't_N225toGSPC'], title='Transfer Entropy between GSPC and N225', width=1000, height=600)
# re-clipping via f is a no-op here: both columns are already >= 0
yearly['net_info_flow'] = yearly['t_GSPCtoN225'].apply(f) - yearly['t_N225toGSPC'].apply(f)
px.bar(yearly, x='yymmdd', y='net_info_flow', title='Net Information Flow GSPC -> N225', width=1000, height=600)
# net information flow from GSPC -> N225
# keyword `columns=`: the positional axis argument to drop() was removed in pandas 2.0
yearly.drop(columns='yymmdd').style.bar(subset=['t_GSPCtoN225', 't_N225toGSPC', 'net_info_flow'], align='mid', color=['#d65f5f', '#5fba7d'], axis=None)
| year | t_GSPCtoN225 | t_N225toGSPC | net_info_flow | |
|---|---|---|---|---|
| 0 | 2000 | 0.073134 | 0.003553 | 0.069581 |
| 1 | 2001 | 0.042647 | 0.000000 | 0.042647 |
| 2 | 2002 | 0.069011 | 0.008767 | 0.060244 |
| 3 | 2003 | 0.089585 | 0.000000 | 0.089585 |
| 4 | 2004 | 0.047205 | 0.001261 | 0.045944 |
| 5 | 2005 | 0.053783 | 0.035861 | 0.017922 |
| 6 | 2006 | 0.068499 | 0.000000 | 0.068499 |
| 7 | 2007 | 0.120493 | 0.000000 | 0.120493 |
| 8 | 2008 | 0.152692 | 0.001503 | 0.151189 |
| 9 | 2009 | 0.207931 | 0.030833 | 0.177098 |
| 10 | 2010 | 0.229401 | 0.002642 | 0.226759 |
| 11 | 2011 | 0.269289 | 0.015190 | 0.254099 |
| 12 | 2012 | 0.147744 | 0.039211 | 0.108533 |
| 13 | 2013 | 0.125985 | 0.008871 | 0.117115 |
| 14 | 2014 | 0.211213 | 0.054382 | 0.156831 |
| 15 | 2015 | 0.168373 | 0.050655 | 0.117718 |
| 16 | 2016 | 0.054729 | 0.016947 | 0.037783 |
| 17 | 2017 | 0.039777 | 0.000000 | 0.039777 |
| 18 | 2018 | 0.216686 | 0.000000 | 0.216686 |
| 19 | 2019 | 0.201544 | 0.044058 | 0.157486 |
| 20 | 2020 | 0.187151 | 0.000000 | 0.187151 |
# back to the full four-index dataset: confirm no missing values
data.isnull().sum()
date 0 N225 0 GSPC 0 N100 0 BVSP 0 dtype: int64
# NOTE(review): datetime_is_numeric was removed in pandas 2.0 (its behavior became the default)
data['date'].describe(datetime_is_numeric=True)
count 4885 mean 2010-03-22 09:22:08.720573184 min 2000-01-05 00:00:00 25% 2004-11-17 00:00:00 50% 2010-02-12 00:00:00 75% 2015-07-10 00:00:00 max 2020-10-28 00:00:00 Name: date, dtype: object
# every column except the date is an index return series
indices = [col for col in data.columns if col != 'date']
indices
['N225', 'GSPC', 'N100', 'BVSP']
# pairwise TE matrix: row = source index, column = destination index
all_transfer_entropy = np.zeros((len(indices), len(indices)))
all_transfer_entropy.shape
(4, 4)
# Pairwise transfer entropy between all indices (row = source, col = destination).
for src_idx in range(len(indices)):
    for dest_idx in range(len(indices)):
        if src_idx == dest_idx:
            # skip self-TE, consistent with the later ETF loops; the
            # diagonal keeps its initialised value of 0 (self-TE was
            # computed-then-clipped to 0 before, so results are unchanged)
            continue
        te = calc.transfer_entropy(src=data[indices[src_idx]].values,
                                   dest=data[indices[dest_idx]].values,
                                   nearest_neighbors=K)
        # clip estimator noise: transfer entropy is non-negative by definition
        all_transfer_entropy[src_idx][dest_idx] = max(0, te)
Row index -> source index.
Column index -> destination index.
# heatmap-friendly DataFrame view of the TE matrix
df = pd.DataFrame(all_transfer_entropy, columns=indices, index=indices)
df.style.background_gradient(axis=None)
| N225 | GSPC | N100 | BVSP | |
|---|---|---|---|---|
| N225 | 0.000000 | 0.018052 | 0.022705 | 0.010250 |
| GSPC | 0.131662 | 0.000000 | 0.051715 | 0.005894 |
| N100 | 0.084533 | 0.034913 | 0.000000 | 0.000000 |
| BVSP | 0.036315 | 0.000000 | 0.003056 | 0.000000 |
# annotated heatmap: rows are sources, columns are destinations
plt.figure(figsize=(10,10))
sns.heatmap(df, annot=True, square=True)
plt.ylabel('Source')
plt.xlabel('Destination')
Text(0.5, 115.09999999999997, 'Destination')
# row wise summation = summation by source
te = df.values.sum(axis=1)
# normalize
te /= te.sum()
# Build from a dict so Transfer_entropy_contrib stays numeric:
# np.stack([indices, te]).T coerced everything to strings, which made the
# sort below lexicographic rather than numeric (it only worked by accident).
df = pd.DataFrame({'ETF': indices, 'Transfer_entropy_contrib': te})
df.sort_values('Transfer_entropy_contrib', ascending=False)
| ETF | Transfer_entropy_contrib | |
|---|---|---|
| 1 | GSPC | 0.47425127919485627 |
| 2 | N100 | 0.2992923005418552 |
| 0 | N225 | 0.1278072338387267 |
| 3 | BVSP | 0.09864918642456186 |
| Index | Name | Industry | Source |
|---|---|---|---|
| USO | United States Oil Fund | Oil | Link |
| ICLN | iShares Global Clean Energy ETF | Renewable Energy | Link |
| JETS | U.S. Global Jets ETF | Airline | Link |
| IYT | iShares Transportation Average ETF | Transportation | Link |
| XLP | Consumer Staples Select Sector SPDR Fund | Consumer staples | Link |
| SMH | VanEck Vectors Semiconductor ETF | Semiconductor | Link |
| IXP | iShares Global Comm Services ETF | Telecom | Link |
| VGT | Vanguard Information Technology Index Fund | Technology | Link |
| XPH | SPDR S&P Pharmaceuticals ETF | Pharmaceutical | Link |
# Ticker -> industry labels for the sector ETFs analysed below
# (insertion order kept identical to the original mapping).
industries = {
    'USO': 'Oil',
    'ICLN': 'Renewable Energy',
    'JETS': 'Airline',
    'IYT': 'Transportation',
    'XLP': 'Consumer Staples',
    'SMH': 'Semiconductor',
    'IXP': 'Telecom',
    'VGT': 'Technology',
    'XPH': 'Pharmaceutical',
}
# Load the ETF price histories, keyed by ticker.
dfs = {}
path = 'data/etf'
for fname in os.listdir(path):
    if fname.endswith('csv'):
        ticker, _ = os.path.splitext(fname)
        dfs[ticker] = pd.read_csv(os.path.join(path, fname))
dfs.keys()
dict_keys(['XPH', 'SMH', 'ICLN', 'JETS', 'XLP', 'USO', 'IXP', 'IYT', 'VGT'])
# Normalise column names, parse dates, then print each ETF's trading date range.
for frame in dfs.values():
    frame.columns = [c.lower().replace(' ', '_') for c in frame.columns]
    frame['date'] = pd.to_datetime(frame['date'])
for ticker, frame in dfs.items():
    print(f"{ticker:>4} {frame['date'].dt.date.min()} {frame['date'].dt.date.max()}")
XPH 2006-07-03 2020-10-30 SMH 2000-07-03 2020-10-30 ICLN 2008-07-01 2020-10-30 JETS 2015-05-01 2020-10-30 XLP 1999-01-04 2020-10-30 USO 2006-05-01 2020-10-30 IXP 2001-12-03 2020-10-30 IYT 2004-01-05 2020-10-30 VGT 2004-02-02 2020-10-30
# compute log returns for each ETF
for df in dfs.values():
    df['daily_return'] = daily_return(df)
from functools import reduce

# Keep only date + return per ETF (renamed to the ticker), then outer-join:
# the outer join keeps every date even when some ETFs did not yet trade.
cols = ['open', 'high', 'low', 'close', 'adj_close', 'volume']
for key in dfs.keys():
    dfs[key].rename(columns={'daily_return': key}, inplace=True)
    # keyword `columns=`: the positional axis argument was removed in pandas 2.0
    dfs[key].drop(columns=cols, inplace=True)
f = lambda left, right: pd.merge(left, right, on='date', how='outer')
etf = reduce(f, dfs.values())
etf.head()
| date | XPH | SMH | ICLN | JETS | XLP | USO | IXP | IYT | VGT | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2006-07-03 | NaN | 0.012981 | NaN | NaN | 0.005771 | 0.002291 | 0.007590 | 0.011234 | 0.010543 |
| 1 | 2006-07-05 | 0.0 | -0.027982 | NaN | NaN | -0.008254 | 0.015185 | -0.007019 | -0.011911 | -0.022512 |
| 2 | 2006-07-06 | 0.0 | -0.001762 | NaN | NaN | 0.011537 | -0.003245 | 0.003610 | -0.007147 | 0.002187 |
| 3 | 2006-07-07 | 0.0 | -0.016220 | NaN | NaN | -0.002050 | -0.013085 | -0.004753 | -0.008921 | -0.012308 |
| 4 | 2006-07-10 | 0.0 | -0.022580 | NaN | NaN | 0.006138 | -0.004160 | 0.002855 | 0.004471 | -0.015377 |
# missing values reflect each ETF's later inception date
etf.isnull().sum()
date 0 XPH 1886 SMH 379 ICLN 2388 JETS 4108 XLP 1 USO 1842 IXP 733 IYT 1258 VGT 1277 dtype: int64
# every column except the date is an ETF return series
cols = [col for col in etf.columns if col != 'date']
cols
['XPH', 'SMH', 'ICLN', 'JETS', 'XLP', 'USO', 'IXP', 'IYT', 'VGT']
# pairwise TE matrix for the ETFs (row = source, column = destination)
all_transfer_entropy = np.zeros((len(cols), len(cols)))
all_transfer_entropy.shape
(9, 9)
# Pairwise TE across ETFs; each pair is aligned on shared dates and rows
# with missing returns are dropped before estimating.
for src_idx in range(len(cols)):
    for dest_idx in range(len(cols)):
        if src_idx == dest_idx:
            continue
        pair = pd.merge(etf[['date', cols[src_idx]]],
                        etf[['date', cols[dest_idx]]],
                        on='date', how='inner').copy()
        pair.dropna(inplace=True)
        te = calc.transfer_entropy(src=pair[cols[src_idx]].values,
                                   dest=pair[cols[dest_idx]].values,
                                   nearest_neighbors=K)
        # TE is non-negative; clip estimator noise
        all_transfer_entropy[src_idx][dest_idx] = max(0, te)
df = pd.DataFrame(all_transfer_entropy, columns=cols, index=cols)
df.style.background_gradient(axis=None)
| XPH | SMH | ICLN | JETS | XLP | USO | IXP | IYT | VGT | |
|---|---|---|---|---|---|---|---|---|---|
| XPH | 0.000000 | 0.009539 | 0.000000 | 0.027026 | 0.014257 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| SMH | 0.022693 | 0.000000 | 0.000068 | 0.002070 | 0.000288 | 0.000000 | 0.005432 | 0.000000 | 0.001041 |
| ICLN | 0.010220 | 0.000880 | 0.000000 | 0.012156 | 0.000000 | 0.025376 | 0.010972 | 0.000000 | 0.000252 |
| JETS | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.021134 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| XLP | 0.000000 | 0.004374 | 0.010205 | 0.013910 | 0.000000 | 0.021285 | 0.000000 | 0.004582 | 0.008809 |
| USO | 0.005098 | 0.000000 | 0.001613 | 0.000000 | 0.000000 | 0.000000 | 0.000844 | 0.019538 | 0.011730 |
| IXP | 0.000000 | 0.011098 | 0.014798 | 0.001633 | 0.000000 | 0.000000 | 0.000000 | 0.001099 | 0.000000 |
| IYT | 0.016727 | 0.018520 | 0.012567 | 0.000000 | 0.000000 | 0.011099 | 0.018928 | 0.000000 | 0.020067 |
| VGT | 0.005194 | 0.000000 | 0.017997 | 0.000000 | 0.016225 | 0.000000 | 0.003840 | 0.007154 | 0.000000 |
# relabel the TE matrix axes with industry names
ticker_to_industry = lambda x: industries[x]
df.columns = [ticker_to_industry(ticker) for ticker in df.columns]
df.index = [ticker_to_industry(ticker) for ticker in df.index]
df.style.background_gradient(axis=None)
| Pharmaceutical | Semiconductor | Renewable Energy | Airline | Consumer Staples | Oil | Telecom | Transportation | Technology | |
|---|---|---|---|---|---|---|---|---|---|
| Pharmaceutical | 0.000000 | 0.009539 | 0.000000 | 0.027026 | 0.014257 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| Semiconductor | 0.022693 | 0.000000 | 0.000068 | 0.002070 | 0.000288 | 0.000000 | 0.005432 | 0.000000 | 0.001041 |
| Renewable Energy | 0.010220 | 0.000880 | 0.000000 | 0.012156 | 0.000000 | 0.025376 | 0.010972 | 0.000000 | 0.000252 |
| Airline | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.021134 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| Consumer Staples | 0.000000 | 0.004374 | 0.010205 | 0.013910 | 0.000000 | 0.021285 | 0.000000 | 0.004582 | 0.008809 |
| Oil | 0.005098 | 0.000000 | 0.001613 | 0.000000 | 0.000000 | 0.000000 | 0.000844 | 0.019538 | 0.011730 |
| Telecom | 0.000000 | 0.011098 | 0.014798 | 0.001633 | 0.000000 | 0.000000 | 0.000000 | 0.001099 | 0.000000 |
| Transportation | 0.016727 | 0.018520 | 0.012567 | 0.000000 | 0.000000 | 0.011099 | 0.018928 | 0.000000 | 0.020067 |
| Technology | 0.005194 | 0.000000 | 0.017997 | 0.000000 | 0.016225 | 0.000000 | 0.003840 | 0.007154 | 0.000000 |
# row wise summation = summation by source
te = df.values.sum(axis=1)
# normalize
te /= te.sum()
# Build from a dict so the contribution column stays float: the previous
# np.stack([cols, te]).T coerced values to strings, making sort_values
# lexicographic rather than numeric.
df = pd.DataFrame({'ETF': cols, 'Transfer_entropy_contrib': te})
df.sort_values('Transfer_entropy_contrib', ascending=False, inplace=True)
df['industry'] = df['ETF'].apply(lambda x: industries[x])
df
| ETF | Transfer_entropy_contrib | industry | |
|---|---|---|---|
| 7 | IYT | 0.22234015289467507 | Transportation |
| 4 | XLP | 0.1422194180891907 | Consumer Staples |
| 2 | ICLN | 0.13378726776196836 | Renewable Energy |
| 8 | VGT | 0.11548041637751796 | Technology |
| 0 | XPH | 0.11474490472173349 | Pharmaceutical |
| 5 | USO | 0.08703802014644366 | Oil |
| 1 | SMH | 0.07086608369622154 | Semiconductor |
| 6 | IXP | 0.06591261642531517 | Telecom |
| 3 | JETS | 0.04761111988693408 | Airline |
# Repeat the pairwise TE on the post-2015-05-02 window (when JETS, the
# youngest ETF, starts trading) so all pairs cover the same period.
for src_idx in range(len(cols)):
    for dest_idx in range(len(cols)):
        if src_idx == dest_idx:
            continue
        src = etf[['date'] + [cols[src_idx]]]
        dest = etf[['date'] + [cols[dest_idx]]]
        merged = pd.merge(src, dest, on='date', how='outer').copy()
        merged = merged[merged.date > '2015-05-02']
        # NOTE(review): unlike the previous loop there is no dropna() here;
        # any NaNs surviving the date cut are handed to the estimator — verify
        all_transfer_entropy[src_idx][dest_idx] = calc.transfer_entropy(src=merged[cols[src_idx]].values,
                                                                        dest=merged[cols[dest_idx]].values,
                                                                        nearest_neighbors=K)
        # clip negative estimator noise (TE is non-negative)
        all_transfer_entropy[src_idx][dest_idx] = max(0, all_transfer_entropy[src_idx][dest_idx])
| XPH | SMH | ICLN | JETS | XLP | USO | IXP | IYT | VGT | |
|---|---|---|---|---|---|---|---|---|---|
| XPH | 0.000000 | 0.002002 | 0.000000 | 0.026929 | 0.008916 | 0.000000 | 0.005017 | 0.018801 | 0.000000 |
| SMH | 0.000000 | 0.000000 | 0.016593 | 0.001983 | 0.005971 | 0.000000 | 0.000501 | 0.013770 | 0.000000 |
| ICLN | 0.042140 | 0.000000 | 0.000000 | 0.012476 | 0.000000 | 0.017954 | 0.000000 | 0.018155 | 0.012681 |
| JETS | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.021155 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| XLP | 0.001649 | 0.000000 | 0.016916 | 0.013869 | 0.000000 | 0.000000 | 0.000000 | 0.012087 | 0.000000 |
| USO | 0.017203 | 0.000000 | 0.010774 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.005790 | 0.014790 |
| IXP | 0.003840 | 0.003950 | 0.013165 | 0.001598 | 0.023812 | 0.002834 | 0.000000 | 0.000000 | 0.000000 |
| IYT | 0.005510 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.010406 | 0.000000 | 0.000000 |
| VGT | 0.000000 | 0.000000 | 0.008680 | 0.000000 | 0.021081 | 0.017188 | 0.004864 | 0.000000 | 0.000000 |
# relabel the post-2015 TE matrix with industry names
# (re-defining the lambda is harmless — it is identical to the one above)
ticker_to_industry = lambda x: industries[x]
df.columns = [ticker_to_industry(ticker) for ticker in df.columns]
df.index = [ticker_to_industry(ticker) for ticker in df.index]
df.style.background_gradient(axis=None)
| Pharmaceutical | Semiconductor | Renewable Energy | Airline | Consumer Staples | Oil | Telecom | Transportation | Technology | |
|---|---|---|---|---|---|---|---|---|---|
| Pharmaceutical | 0.000000 | 0.002002 | 0.000000 | 0.026929 | 0.008916 | 0.000000 | 0.005017 | 0.018801 | 0.000000 |
| Semiconductor | 0.000000 | 0.000000 | 0.016593 | 0.001983 | 0.005971 | 0.000000 | 0.000501 | 0.013770 | 0.000000 |
| Renewable Energy | 0.042140 | 0.000000 | 0.000000 | 0.012476 | 0.000000 | 0.017954 | 0.000000 | 0.018155 | 0.012681 |
| Airline | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.021155 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| Consumer Staples | 0.001649 | 0.000000 | 0.016916 | 0.013869 | 0.000000 | 0.000000 | 0.000000 | 0.012087 | 0.000000 |
| Oil | 0.017203 | 0.000000 | 0.010774 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.005790 | 0.014790 |
| Telecom | 0.003840 | 0.003950 | 0.013165 | 0.001598 | 0.023812 | 0.002834 | 0.000000 | 0.000000 | 0.000000 |
| Transportation | 0.005510 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.010406 | 0.000000 | 0.000000 |
| Technology | 0.000000 | 0.000000 | 0.008680 | 0.000000 | 0.021081 | 0.017188 | 0.004864 | 0.000000 | 0.000000 |
# row wise summation = summation by source
te = df.values.sum(axis=1)
# normalize
te /= te.sum()
# Build from a dict so the contribution column stays float: the previous
# np.stack([cols, te]).T coerced values to strings, making sort_values
# lexicographic rather than numeric.
df = pd.DataFrame({'ETF': cols, 'Transfer_entropy_contrib': te})
df.sort_values('Transfer_entropy_contrib', ascending=False, inplace=True)
df['industry'] = df['ETF'].apply(lambda x: industries[x])
df
| ETF | Transfer_entropy_contrib | industry | |
|---|---|---|---|
| 2 | ICLN | 0.2360911037480295 | Renewable Energy |
| 0 | XPH | 0.14269329488399313 | Pharmaceutical |
| 8 | VGT | 0.11813689477084147 | Technology |
| 5 | USO | 0.11334413284475249 | Oil |
| 6 | IXP | 0.11265432557711536 | Telecom |
| 4 | XLP | 0.10046068274679416 | Consumer Staples |
| 1 | SMH | 0.09084799620798856 | Semiconductor |
| 3 | JETS | 0.048882765022124075 | Airline |
| 7 | IYT | 0.03688880419836126 | Transportation |
# per-year contribution table schema: Year plus one column per ETF
columns = ['Year'] + cols
columns
['Year', 'XPH', 'SMH', 'ICLN', 'JETS', 'XLP', 'USO', 'IXP', 'IYT', 'VGT']
# NOTE(review): datetime_is_numeric was removed in pandas 2.0 (its behavior became the default)
etf['date'].describe(datetime_is_numeric=True)
count 5494 mean 2009-12-03 09:06:13.498361856 min 1999-01-04 00:00:00 25% 2004-06-22 06:00:00 50% 2009-12-02 12:00:00 75% 2015-05-19 18:00:00 max 2020-10-30 00:00:00 Name: date, dtype: object
# spot-check an early year: most ETFs did not exist yet in 2001
etf[etf['date'].dt.year == 2001]
| date | XPH | SMH | ICLN | JETS | XLP | USO | IXP | IYT | VGT | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3735 | 2001-01-02 | NaN | -0.014388 | NaN | NaN | -0.013216 | NaN | NaN | NaN | NaN |
| 3736 | 2001-01-03 | NaN | 0.152374 | NaN | NaN | -0.047100 | NaN | NaN | NaN | NaN |
| 3737 | 2001-01-04 | NaN | -0.031422 | NaN | NaN | -0.040314 | NaN | NaN | NaN | NaN |
| 3738 | 2001-01-05 | NaN | -0.061669 | NaN | NaN | 0.006632 | NaN | NaN | NaN | NaN |
| 3739 | 2001-01-08 | NaN | 0.014336 | NaN | NaN | 0.006588 | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3978 | 2001-12-24 | NaN | -0.008629 | NaN | NaN | -0.000389 | NaN | 0.000687 | NaN | NaN |
| 3979 | 2001-12-26 | NaN | 0.008867 | NaN | NaN | 0.002715 | NaN | -0.002946 | NaN | NaN |
| 3980 | 2001-12-27 | NaN | 0.017268 | NaN | NaN | -0.006216 | NaN | 0.012313 | NaN | NaN |
| 3981 | 2001-12-28 | NaN | 0.012353 | NaN | NaN | -0.004296 | NaN | 0.005038 | NaN | NaN |
| 3982 | 2001-12-31 | NaN | -0.031054 | NaN | NaN | -0.005888 | NaN | -0.001547 | NaN | NaN |
248 rows × 10 columns
# Yearly normalised TE contribution per ETF. NOTE(review): this rebinds
# `data`, clobbering the earlier index-returns DataFrame (notebook-style
# name reuse), and it reuses the `all_transfer_entropy` buffer from the
# previous cells (every off-diagonal cell is overwritten each year).
data = []
for year in range(2002, 2021):
    for src_idx in range(len(cols)):
        for dest_idx in range(len(cols)):
            if src_idx == dest_idx:
                continue
            logic = etf['date'].dt.year == year
            src = etf.loc[logic, ['date'] + [cols[src_idx]]]
            dest = etf.loc[logic, ['date'] + [cols[dest_idx]]]
            merged = pd.merge(src, dest, on='date', how='outer').copy()
            merged.dropna(inplace=True)
            # require most of a trading year (>180 shared days) for a stable estimate
            if len(merged) > 180:
                all_transfer_entropy[src_idx][dest_idx] = calc.transfer_entropy(src=merged[cols[src_idx]].values,
                                                                                dest=merged[cols[dest_idx]].values,
                                                                                nearest_neighbors=K)
                # clip negative estimator noise (TE is non-negative)
                all_transfer_entropy[src_idx][dest_idx] = max(0, all_transfer_entropy[src_idx][dest_idx])
            else:
                # too few shared observations: treat the flow as absent
                all_transfer_entropy[src_idx][dest_idx] = 0
    # row wise summation by source, and then normalize
    te = all_transfer_entropy.sum(axis=1)
    te /= te.sum()
    data += [year] + list(te),
data = pd.DataFrame(data, columns=columns)
data
| Year | XPH | SMH | ICLN | JETS | XLP | USO | IXP | IYT | VGT | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2002 | 0.000000 | 0.351504 | 0.000000 | 0.000000 | 0.546312 | 0.000000 | 0.102184 | 0.000000 | 0.000000 |
| 1 | 2003 | 0.000000 | 0.268880 | 0.000000 | 0.000000 | 0.502318 | 0.000000 | 0.228802 | 0.000000 | 0.000000 |
| 2 | 2004 | 0.000000 | 0.057117 | 0.000000 | 0.000000 | 0.131527 | 0.000000 | 0.255277 | 0.425826 | 0.130253 |
| 3 | 2005 | 0.000000 | 0.201834 | 0.000000 | 0.000000 | 0.109531 | 0.000000 | 0.087163 | 0.359444 | 0.242029 |
| 4 | 2006 | 0.000000 | 0.138424 | 0.000000 | 0.000000 | 0.245103 | 0.000000 | 0.227399 | 0.298876 | 0.090199 |
| 5 | 2007 | 0.023105 | 0.097225 | 0.000000 | 0.000000 | 0.039366 | 0.213104 | 0.068295 | 0.280837 | 0.278068 |
| 6 | 2008 | 0.206011 | 0.027874 | 0.000000 | 0.000000 | 0.047243 | 0.252743 | 0.199062 | 0.106525 | 0.160542 |
| 7 | 2009 | 0.025382 | 0.081523 | 0.067746 | 0.000000 | 0.241051 | 0.085381 | 0.103899 | 0.252535 | 0.142484 |
| 8 | 2010 | 0.074009 | 0.036683 | 0.164379 | 0.000000 | 0.004832 | 0.246538 | 0.268585 | 0.075735 | 0.129240 |
| 9 | 2011 | 0.153696 | 0.022067 | 0.002933 | 0.000000 | 0.245540 | 0.196389 | 0.127495 | 0.063833 | 0.188046 |
| 10 | 2012 | 0.106558 | 0.046596 | 0.126051 | 0.000000 | 0.035346 | 0.160731 | 0.157176 | 0.110552 | 0.256990 |
| 11 | 2013 | 0.157566 | 0.136117 | 0.131832 | 0.000000 | 0.108710 | 0.081415 | 0.163197 | 0.123647 | 0.097515 |
| 12 | 2014 | 0.156866 | 0.071775 | 0.175108 | 0.000000 | 0.015930 | 0.143599 | 0.145213 | 0.117638 | 0.173871 |
| 13 | 2015 | 0.134925 | 0.066612 | 0.121931 | 0.000000 | 0.081826 | 0.123902 | 0.240245 | 0.119352 | 0.111208 |
| 14 | 2016 | 0.030367 | 0.086001 | 0.202885 | 0.169737 | 0.093977 | 0.078701 | 0.168365 | 0.065023 | 0.104944 |
| 15 | 2017 | 0.044076 | 0.074593 | 0.086967 | 0.092864 | 0.125465 | 0.136620 | 0.105720 | 0.232160 | 0.101535 |
| 16 | 2018 | 0.086298 | 0.173048 | 0.016955 | 0.082027 | 0.115597 | 0.097943 | 0.194383 | 0.097919 | 0.135830 |
| 17 | 2019 | 0.106301 | 0.121157 | 0.093962 | 0.223178 | 0.104455 | 0.117606 | 0.083892 | 0.057465 | 0.091983 |
| 18 | 2020 | 0.137117 | 0.132593 | 0.118680 | 0.019373 | 0.235498 | 0.023934 | 0.156946 | 0.095072 | 0.080786 |
# Replace ticker symbols with human-readable sector names; 'Year' stays as-is.
renamed = []
for col in data.columns:
    renamed.append(col if col == 'Year' else industries[col])
data.columns = renamed
data
| Year | Pharmaceutical | Semiconductor | Renewable Energy | Airline | Consumer Staples | Oil | Telecom | Transportation | Technology | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2002 | 0.000000 | 0.351504 | 0.000000 | 0.000000 | 0.546312 | 0.000000 | 0.102184 | 0.000000 | 0.000000 |
| 1 | 2003 | 0.000000 | 0.268880 | 0.000000 | 0.000000 | 0.502318 | 0.000000 | 0.228802 | 0.000000 | 0.000000 |
| 2 | 2004 | 0.000000 | 0.057117 | 0.000000 | 0.000000 | 0.131527 | 0.000000 | 0.255277 | 0.425826 | 0.130253 |
| 3 | 2005 | 0.000000 | 0.201834 | 0.000000 | 0.000000 | 0.109531 | 0.000000 | 0.087163 | 0.359444 | 0.242029 |
| 4 | 2006 | 0.000000 | 0.138424 | 0.000000 | 0.000000 | 0.245103 | 0.000000 | 0.227399 | 0.298876 | 0.090199 |
| 5 | 2007 | 0.023105 | 0.097225 | 0.000000 | 0.000000 | 0.039366 | 0.213104 | 0.068295 | 0.280837 | 0.278068 |
| 6 | 2008 | 0.206011 | 0.027874 | 0.000000 | 0.000000 | 0.047243 | 0.252743 | 0.199062 | 0.106525 | 0.160542 |
| 7 | 2009 | 0.025382 | 0.081523 | 0.067746 | 0.000000 | 0.241051 | 0.085381 | 0.103899 | 0.252535 | 0.142484 |
| 8 | 2010 | 0.074009 | 0.036683 | 0.164379 | 0.000000 | 0.004832 | 0.246538 | 0.268585 | 0.075735 | 0.129240 |
| 9 | 2011 | 0.153696 | 0.022067 | 0.002933 | 0.000000 | 0.245540 | 0.196389 | 0.127495 | 0.063833 | 0.188046 |
| 10 | 2012 | 0.106558 | 0.046596 | 0.126051 | 0.000000 | 0.035346 | 0.160731 | 0.157176 | 0.110552 | 0.256990 |
| 11 | 2013 | 0.157566 | 0.136117 | 0.131832 | 0.000000 | 0.108710 | 0.081415 | 0.163197 | 0.123647 | 0.097515 |
| 12 | 2014 | 0.156866 | 0.071775 | 0.175108 | 0.000000 | 0.015930 | 0.143599 | 0.145213 | 0.117638 | 0.173871 |
| 13 | 2015 | 0.134925 | 0.066612 | 0.121931 | 0.000000 | 0.081826 | 0.123902 | 0.240245 | 0.119352 | 0.111208 |
| 14 | 2016 | 0.030367 | 0.086001 | 0.202885 | 0.169737 | 0.093977 | 0.078701 | 0.168365 | 0.065023 | 0.104944 |
| 15 | 2017 | 0.044076 | 0.074593 | 0.086967 | 0.092864 | 0.125465 | 0.136620 | 0.105720 | 0.232160 | 0.101535 |
| 16 | 2018 | 0.086298 | 0.173048 | 0.016955 | 0.082027 | 0.115597 | 0.097943 | 0.194383 | 0.097919 | 0.135830 |
| 17 | 2019 | 0.106301 | 0.121157 | 0.093962 | 0.223178 | 0.104455 | 0.117606 | 0.083892 | 0.057465 | 0.091983 |
| 18 | 2020 | 0.137117 | 0.132593 | 0.118680 | 0.019373 | 0.235498 | 0.023934 | 0.156946 | 0.095072 | 0.080786 |
data.style.background_gradient(axis=0, subset=list(industries.values()))
| Year | Pharmaceutical | Semiconductor | Renewable Energy | Airline | Consumer Staples | Oil | Telecom | Transportation | Technology | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2002 | 0.000000 | 0.351504 | 0.000000 | 0.000000 | 0.546312 | 0.000000 | 0.102184 | 0.000000 | 0.000000 |
| 1 | 2003 | 0.000000 | 0.268880 | 0.000000 | 0.000000 | 0.502318 | 0.000000 | 0.228802 | 0.000000 | 0.000000 |
| 2 | 2004 | 0.000000 | 0.057117 | 0.000000 | 0.000000 | 0.131527 | 0.000000 | 0.255277 | 0.425826 | 0.130253 |
| 3 | 2005 | 0.000000 | 0.201834 | 0.000000 | 0.000000 | 0.109531 | 0.000000 | 0.087163 | 0.359444 | 0.242029 |
| 4 | 2006 | 0.000000 | 0.138424 | 0.000000 | 0.000000 | 0.245103 | 0.000000 | 0.227399 | 0.298876 | 0.090199 |
| 5 | 2007 | 0.023105 | 0.097225 | 0.000000 | 0.000000 | 0.039366 | 0.213104 | 0.068295 | 0.280837 | 0.278068 |
| 6 | 2008 | 0.206011 | 0.027874 | 0.000000 | 0.000000 | 0.047243 | 0.252743 | 0.199062 | 0.106525 | 0.160542 |
| 7 | 2009 | 0.025382 | 0.081523 | 0.067746 | 0.000000 | 0.241051 | 0.085381 | 0.103899 | 0.252535 | 0.142484 |
| 8 | 2010 | 0.074009 | 0.036683 | 0.164379 | 0.000000 | 0.004832 | 0.246538 | 0.268585 | 0.075735 | 0.129240 |
| 9 | 2011 | 0.153696 | 0.022067 | 0.002933 | 0.000000 | 0.245540 | 0.196389 | 0.127495 | 0.063833 | 0.188046 |
| 10 | 2012 | 0.106558 | 0.046596 | 0.126051 | 0.000000 | 0.035346 | 0.160731 | 0.157176 | 0.110552 | 0.256990 |
| 11 | 2013 | 0.157566 | 0.136117 | 0.131832 | 0.000000 | 0.108710 | 0.081415 | 0.163197 | 0.123647 | 0.097515 |
| 12 | 2014 | 0.156866 | 0.071775 | 0.175108 | 0.000000 | 0.015930 | 0.143599 | 0.145213 | 0.117638 | 0.173871 |
| 13 | 2015 | 0.134925 | 0.066612 | 0.121931 | 0.000000 | 0.081826 | 0.123902 | 0.240245 | 0.119352 | 0.111208 |
| 14 | 2016 | 0.030367 | 0.086001 | 0.202885 | 0.169737 | 0.093977 | 0.078701 | 0.168365 | 0.065023 | 0.104944 |
| 15 | 2017 | 0.044076 | 0.074593 | 0.086967 | 0.092864 | 0.125465 | 0.136620 | 0.105720 | 0.232160 | 0.101535 |
| 16 | 2018 | 0.086298 | 0.173048 | 0.016955 | 0.082027 | 0.115597 | 0.097943 | 0.194383 | 0.097919 | 0.135830 |
| 17 | 2019 | 0.106301 | 0.121157 | 0.093962 | 0.223178 | 0.104455 | 0.117606 | 0.083892 | 0.057465 | 0.091983 |
| 18 | 2020 | 0.137117 | 0.132593 | 0.118680 | 0.019373 | 0.235498 | 0.023934 | 0.156946 | 0.095072 | 0.080786 |
# Stacked area and bar views of each sector's yearly share of total TE.
sector_names = list(industries.values())
chart_title = 'Yearly Transfer Entropy Contribution by Sectors'
px.area(data, x='Year', y=sector_names, title=chart_title, width=1000, height=600)
px.bar(data, x='Year', y=sector_names, title=chart_title, width=1000, height=600)
# Yearly bidirectional TE between Oil (USO) and Transportation (IYT).
selected_ind = ['USO', 'IYT']
cols = ['year', f'{selected_ind[0]}to{selected_ind[1]}', f'{selected_ind[1]}to{selected_ind[0]}']
K = 4  # nearest neighbors for the Kraskov estimator
yearly_stats = []
for yr in range(2000, 2021):
    t_data = etf.loc[etf['date'].dt.year == yr, selected_ind].dropna()
    if len(t_data) > K:  # need more samples than neighbors to estimate
        fwd = calc.transfer_entropy(src=t_data[selected_ind[0]].values,
                                    dest=t_data[selected_ind[1]].values,
                                    nearest_neighbors=K)
        rev = calc.transfer_entropy(src=t_data[selected_ind[1]].values,
                                    dest=t_data[selected_ind[0]].values,
                                    nearest_neighbors=K)
        yearly_stats.append([yr, fwd, rev])
yearly = pd.DataFrame(yearly_stats, columns=cols)
# Negative TE estimates are estimator noise; clamp to zero before differencing.
yearly[cols[1]] = yearly[cols[1]].clip(lower=0)
yearly[cols[2]] = yearly[cols[2]].clip(lower=0)
yearly['net_info_flow'] = yearly[cols[1]] - yearly[cols[2]]
yearly.describe()
| year | USOtoIYT | IYTtoUSO | net_info_flow | |
|---|---|---|---|---|
| count | 15.000000 | 15.000000 | 15.000000 | 15.000000 |
| mean | 2013.000000 | 0.015166 | 0.005575 | 0.009590 |
| std | 4.472136 | 0.021157 | 0.012192 | 0.018208 |
| min | 2006.000000 | 0.000000 | 0.000000 | -0.010429 |
| 25% | 2009.500000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 2013.000000 | 0.002459 | 0.000000 | 0.002459 |
| 75% | 2016.500000 | 0.021527 | 0.006197 | 0.015548 |
| max | 2020.000000 | 0.058124 | 0.046387 | 0.057743 |
# Year-by-year TE in both directions, plus the signed net flow and a styled table.
pair_title = f'Transfer Entropy between {industries[selected_ind[0]]} ({selected_ind[0]}) & {industries[selected_ind[1]]} ({selected_ind[1]})'
flow_title = f'Net Information Flow: {industries[selected_ind[0]]} ({selected_ind[0]}) -> {industries[selected_ind[1]]} ({selected_ind[1]})'
px.line(yearly, x='year', y=cols[1:], title=pair_title, height=600, width=1000)
px.bar(yearly, x='year', y='net_info_flow', title=flow_title, height=600, width=1000)
print(f'Net Information Flow {selected_ind[0]} -> {selected_ind[1]}')
value_cols = [col for col in yearly.columns if col != 'year']
yearly.style.bar(subset=value_cols, align='mid', color=['#d65f5f', '#5fba7d'], axis=None)
Net Information Flow USO -> IYT
| year | USOtoIYT | IYTtoUSO | net_info_flow | |
|---|---|---|---|---|
| 0 | 2006 | 0.000000 | 0.010429 | -0.010429 |
| 1 | 2007 | 0.014782 | 0.000000 | 0.014782 |
| 2 | 2008 | 0.058124 | 0.000381 | 0.057743 |
| 3 | 2009 | 0.000000 | 0.000000 | 0.000000 |
| 4 | 2010 | 0.008479 | 0.003672 | 0.004807 |
| 5 | 2011 | 0.049397 | 0.014039 | 0.035358 |
| 6 | 2012 | 0.000000 | 0.000000 | 0.000000 |
| 7 | 2013 | 0.000000 | 0.000000 | 0.000000 |
| 8 | 2014 | 0.000000 | 0.000000 | 0.000000 |
| 9 | 2015 | 0.051190 | 0.046387 | 0.004803 |
| 10 | 2016 | 0.000000 | 0.008722 | -0.008722 |
| 11 | 2017 | 0.016315 | 0.000000 | 0.016315 |
| 12 | 2018 | 0.000000 | 0.000000 | 0.000000 |
| 13 | 2019 | 0.026740 | 0.000000 | 0.026740 |
| 14 | 2020 | 0.002459 | 0.000000 | 0.002459 |
# Yearly bidirectional TE between Oil (USO) and Renewable Energy (ICLN).
selected_ind = ['USO', 'ICLN']
cols = ['year', f'{selected_ind[0]}to{selected_ind[1]}', f'{selected_ind[1]}to{selected_ind[0]}']
K = 4  # nearest neighbors for the Kraskov estimator
yearly_stats = []
for yr in range(2000, 2021):
    t_data = etf.loc[etf['date'].dt.year == yr, selected_ind].dropna()
    if len(t_data) > K:  # need more samples than neighbors to estimate
        fwd = calc.transfer_entropy(src=t_data[selected_ind[0]].values,
                                    dest=t_data[selected_ind[1]].values,
                                    nearest_neighbors=K)
        rev = calc.transfer_entropy(src=t_data[selected_ind[1]].values,
                                    dest=t_data[selected_ind[0]].values,
                                    nearest_neighbors=K)
        yearly_stats.append([yr, fwd, rev])
yearly = pd.DataFrame(yearly_stats, columns=cols)
# Negative TE estimates are estimator noise; clamp to zero before differencing.
yearly[cols[1]] = yearly[cols[1]].clip(lower=0)
yearly[cols[2]] = yearly[cols[2]].clip(lower=0)
yearly['net_info_flow'] = yearly[cols[1]] - yearly[cols[2]]
yearly.describe()
| year | USOtoICLN | ICLNtoUSO | net_info_flow | |
|---|---|---|---|---|
| count | 13.00000 | 13.000000 | 13.000000 | 13.000000 |
| mean | 2014.00000 | 0.014307 | 0.011559 | 0.002747 |
| std | 3.89444 | 0.022215 | 0.017292 | 0.033181 |
| min | 2008.00000 | 0.000000 | 0.000000 | -0.040631 |
| 25% | 2011.00000 | 0.000000 | 0.000000 | -0.019226 |
| 50% | 2014.00000 | 0.005174 | 0.000000 | 0.000000 |
| 75% | 2017.00000 | 0.016212 | 0.019226 | 0.016212 |
| max | 2020.00000 | 0.074654 | 0.040689 | 0.074654 |
# Year-by-year TE in both directions, plus the signed net flow and a styled table.
pair_title = f'Transfer Entropy between {industries[selected_ind[0]]} ({selected_ind[0]}) & {industries[selected_ind[1]]} ({selected_ind[1]})'
flow_title = f'Net Information Flow: {industries[selected_ind[0]]} ({selected_ind[0]}) -> {industries[selected_ind[1]]} ({selected_ind[1]})'
px.line(yearly, x='year', y=cols[1:], title=pair_title, height=600, width=1000)
px.bar(yearly, x='year', y='net_info_flow', title=flow_title, height=600, width=1000)
print(f'Net Information Flow {selected_ind[0]} -> {selected_ind[1]}')
value_cols = [col for col in yearly.columns if col != 'year']
yearly.style.bar(subset=value_cols, align='mid', color=['#d65f5f', '#5fba7d'], axis=None)
Net Information Flow USO -> ICLN
| year | USOtoICLN | ICLNtoUSO | net_info_flow | |
|---|---|---|---|---|
| 0 | 2008 | 0.000000 | 0.040631 | -0.040631 |
| 1 | 2009 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 2010 | 0.012985 | 0.000000 | 0.012985 |
| 3 | 2011 | 0.016212 | 0.000000 | 0.016212 |
| 4 | 2012 | 0.074654 | 0.000000 | 0.074654 |
| 5 | 2013 | 0.000000 | 0.000790 | -0.000790 |
| 6 | 2014 | 0.000000 | 0.000000 | 0.000000 |
| 7 | 2015 | 0.005174 | 0.009446 | -0.004272 |
| 8 | 2016 | 0.000000 | 0.039486 | -0.039486 |
| 9 | 2017 | 0.033601 | 0.000000 | 0.033601 |
| 10 | 2018 | 0.037207 | 0.000000 | 0.037207 |
| 11 | 2019 | 0.000000 | 0.019226 | -0.019226 |
| 12 | 2020 | 0.006152 | 0.040689 | -0.034537 |
# Yearly bidirectional TE between Oil (USO) and Technology (VGT).
selected_ind = ['USO', 'VGT']
cols = ['year', f'{selected_ind[0]}to{selected_ind[1]}', f'{selected_ind[1]}to{selected_ind[0]}']
K = 4  # nearest neighbors for the Kraskov estimator
yearly_stats = []
for yr in range(2000, 2021):
    t_data = etf.loc[etf['date'].dt.year == yr, selected_ind].dropna()
    if len(t_data) > K:  # need more samples than neighbors to estimate
        fwd = calc.transfer_entropy(src=t_data[selected_ind[0]].values,
                                    dest=t_data[selected_ind[1]].values,
                                    nearest_neighbors=K)
        rev = calc.transfer_entropy(src=t_data[selected_ind[1]].values,
                                    dest=t_data[selected_ind[0]].values,
                                    nearest_neighbors=K)
        yearly_stats.append([yr, fwd, rev])
yearly = pd.DataFrame(yearly_stats, columns=cols)
# Negative TE estimates are estimator noise; clamp to zero before differencing.
yearly[cols[1]] = yearly[cols[1]].clip(lower=0)
yearly[cols[2]] = yearly[cols[2]].clip(lower=0)
yearly['net_info_flow'] = yearly[cols[1]] - yearly[cols[2]]
yearly.describe()
| year | USOtoVGT | VGTtoUSO | net_info_flow | |
|---|---|---|---|---|
| count | 15.000000 | 15.000000 | 15.000000 | 15.000000 |
| mean | 2013.000000 | 0.012530 | 0.023137 | -0.010608 |
| std | 4.472136 | 0.017175 | 0.022957 | 0.028284 |
| min | 2006.000000 | 0.000000 | 0.000000 | -0.043547 |
| 25% | 2009.500000 | 0.000000 | 0.000619 | -0.033670 |
| 50% | 2013.000000 | 0.001089 | 0.015165 | -0.005103 |
| 75% | 2016.500000 | 0.022359 | 0.041073 | 0.000000 |
| max | 2020.000000 | 0.050778 | 0.068384 | 0.050778 |
# Year-by-year TE in both directions, plus the signed net flow and a styled table.
pair_title = f'Transfer Entropy between {industries[selected_ind[0]]} ({selected_ind[0]}) & {industries[selected_ind[1]]} ({selected_ind[1]})'
flow_title = f'Net Information Flow: {industries[selected_ind[0]]} ({selected_ind[0]}) -> {industries[selected_ind[1]]} ({selected_ind[1]})'
px.line(yearly, x='year', y=cols[1:], title=pair_title, height=600, width=1000)
px.bar(yearly, x='year', y='net_info_flow', title=flow_title, height=600, width=1000)
print(f'Net Information Flow {selected_ind[0]} -> {selected_ind[1]}')
value_cols = [col for col in yearly.columns if col != 'year']
yearly.style.bar(subset=value_cols, align='mid', color=['#d65f5f', '#5fba7d'], axis=None)
Net Information Flow USO -> VGT
| year | USOtoVGT | VGTtoUSO | net_info_flow | |
|---|---|---|---|---|
| 0 | 2006 | 0.007685 | 0.012788 | -0.005103 |
| 1 | 2007 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 2008 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 2009 | 0.001089 | 0.005717 | -0.004628 |
| 4 | 2010 | 0.000000 | 0.028965 | -0.028965 |
| 5 | 2011 | 0.027670 | 0.068384 | -0.040714 |
| 6 | 2012 | 0.000000 | 0.038599 | -0.038599 |
| 7 | 2013 | 0.000000 | 0.000000 | 0.000000 |
| 8 | 2014 | 0.011442 | 0.045446 | -0.034004 |
| 9 | 2015 | 0.000000 | 0.033336 | -0.033336 |
| 10 | 2016 | 0.031120 | 0.053878 | -0.022758 |
| 11 | 2017 | 0.050778 | 0.000000 | 0.050778 |
| 12 | 2018 | 0.017048 | 0.015165 | 0.001883 |
| 13 | 2019 | 0.041116 | 0.001238 | 0.039878 |
| 14 | 2020 | 0.000000 | 0.043547 | -0.043547 |
# Yearly bidirectional TE between Semiconductor (SMH) and Technology (VGT).
selected_ind = ['SMH', 'VGT']
cols = ['year', f'{selected_ind[0]}to{selected_ind[1]}', f'{selected_ind[1]}to{selected_ind[0]}']
K = 4  # nearest neighbors for the Kraskov estimator
yearly_stats = []
for yr in range(2000, 2021):
    t_data = etf.loc[etf['date'].dt.year == yr, selected_ind].dropna()
    if len(t_data) > K:  # need more samples than neighbors to estimate
        fwd = calc.transfer_entropy(src=t_data[selected_ind[0]].values,
                                    dest=t_data[selected_ind[1]].values,
                                    nearest_neighbors=K)
        rev = calc.transfer_entropy(src=t_data[selected_ind[1]].values,
                                    dest=t_data[selected_ind[0]].values,
                                    nearest_neighbors=K)
        yearly_stats.append([yr, fwd, rev])
yearly = pd.DataFrame(yearly_stats, columns=cols)
# Negative TE estimates are estimator noise; clamp to zero before differencing.
yearly[cols[1]] = yearly[cols[1]].clip(lower=0)
yearly[cols[2]] = yearly[cols[2]].clip(lower=0)
yearly['net_info_flow'] = yearly[cols[1]] - yearly[cols[2]]
yearly.describe()
| year | SMHtoVGT | VGTtoSMH | net_info_flow | |
|---|---|---|---|---|
| count | 17.000000 | 17.000000 | 17.000000 | 17.000000 |
| mean | 2012.000000 | 0.012225 | 0.005526 | 0.006698 |
| std | 5.049752 | 0.021376 | 0.012672 | 0.013850 |
| min | 2004.000000 | 0.000000 | 0.000000 | -0.020193 |
| 25% | 2008.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 2012.000000 | 0.001628 | 0.000000 | 0.000000 |
| 75% | 2016.000000 | 0.015738 | 0.003988 | 0.013852 |
| max | 2020.000000 | 0.081237 | 0.049729 | 0.034109 |
# Year-by-year TE in both directions, plus the signed net flow and a styled table.
pair_title = f'Transfer Entropy between {industries[selected_ind[0]]} ({selected_ind[0]}) & {industries[selected_ind[1]]} ({selected_ind[1]})'
flow_title = f'Net Information Flow: {industries[selected_ind[0]]} ({selected_ind[0]}) -> {industries[selected_ind[1]]} ({selected_ind[1]})'
px.line(yearly, x='year', y=cols[1:], title=pair_title, height=600, width=1000)
px.bar(yearly, x='year', y='net_info_flow', title=flow_title, height=600, width=1000)
print(f'Net Information Flow {selected_ind[0]} -> {selected_ind[1]}')
value_cols = [col for col in yearly.columns if col != 'year']
yearly.style.bar(subset=value_cols, align='mid', color=['#d65f5f', '#5fba7d'], axis=None)
Net Information Flow SMH -> VGT
| year | SMHtoVGT | VGTtoSMH | net_info_flow | |
|---|---|---|---|---|
| 0 | 2004 | 0.000000 | 0.000000 | 0.000000 |
| 1 | 2005 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 2006 | 0.022229 | 0.001455 | 0.020774 |
| 3 | 2007 | 0.001628 | 0.021820 | -0.020193 |
| 4 | 2008 | 0.000000 | 0.000000 | 0.000000 |
| 5 | 2009 | 0.000000 | 0.000000 | 0.000000 |
| 6 | 2010 | 0.000000 | 0.000000 | 0.000000 |
| 7 | 2011 | 0.000000 | 0.003988 | -0.003988 |
| 8 | 2012 | 0.000000 | 0.000000 | 0.000000 |
| 9 | 2013 | 0.013852 | 0.000000 | 0.013852 |
| 10 | 2014 | 0.015738 | 0.002113 | 0.013625 |
| 11 | 2015 | 0.002192 | 0.005599 | -0.003406 |
| 12 | 2016 | 0.000000 | 0.000000 | 0.000000 |
| 13 | 2017 | 0.008766 | 0.000000 | 0.008766 |
| 14 | 2018 | 0.081237 | 0.049729 | 0.031509 |
| 15 | 2019 | 0.018824 | 0.000000 | 0.018824 |
| 16 | 2020 | 0.043350 | 0.009241 | 0.034109 |
# Yearly bidirectional TE between Transportation (IYT) and Renewable Energy (ICLN).
selected_ind = ['IYT', 'ICLN']
cols = ['year', f'{selected_ind[0]}to{selected_ind[1]}', f'{selected_ind[1]}to{selected_ind[0]}']
K = 4  # nearest neighbors for the Kraskov estimator
yearly_stats = []
for yr in range(2000, 2021):
    t_data = etf.loc[etf['date'].dt.year == yr, selected_ind].dropna()
    if len(t_data) > K:  # need more samples than neighbors to estimate
        fwd = calc.transfer_entropy(src=t_data[selected_ind[0]].values,
                                    dest=t_data[selected_ind[1]].values,
                                    nearest_neighbors=K)
        rev = calc.transfer_entropy(src=t_data[selected_ind[1]].values,
                                    dest=t_data[selected_ind[0]].values,
                                    nearest_neighbors=K)
        yearly_stats.append([yr, fwd, rev])
yearly = pd.DataFrame(yearly_stats, columns=cols)
# Negative TE estimates are estimator noise; clamp to zero before differencing.
yearly[cols[1]] = yearly[cols[1]].clip(lower=0)
yearly[cols[2]] = yearly[cols[2]].clip(lower=0)
yearly['net_info_flow'] = yearly[cols[1]] - yearly[cols[2]]
yearly.describe()
| year | IYTtoICLN | ICLNtoIYT | net_info_flow | |
|---|---|---|---|---|
| count | 13.00000 | 13.000000 | 13.000000 | 13.000000 |
| mean | 2014.00000 | 0.014076 | 0.009527 | 0.004549 |
| std | 3.89444 | 0.024133 | 0.021796 | 0.025976 |
| min | 2008.00000 | 0.000000 | 0.000000 | -0.040289 |
| 25% | 2011.00000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 2014.00000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 2017.00000 | 0.029084 | 0.011814 | 0.002226 |
| max | 2020.00000 | 0.078221 | 0.079073 | 0.078221 |
# Year-by-year TE in both directions, plus the signed net flow and a styled table.
pair_title = f'Transfer Entropy between {industries[selected_ind[0]]} ({selected_ind[0]}) & {industries[selected_ind[1]]} ({selected_ind[1]})'
flow_title = f'Net Information Flow: {industries[selected_ind[0]]} ({selected_ind[0]}) -> {industries[selected_ind[1]]} ({selected_ind[1]})'
px.line(yearly, x='year', y=cols[1:], title=pair_title, height=600, width=1000)
px.bar(yearly, x='year', y='net_info_flow', title=flow_title, width=1000, height=600)
print(f'Net Information Flow {selected_ind[0]} -> {selected_ind[1]}')
value_cols = [col for col in yearly.columns if col != 'year']
yearly.style.bar(subset=value_cols, align='mid', color=['#d65f5f', '#5fba7d'], axis=None)
Net Information Flow IYT -> ICLN
| year | IYTtoICLN | ICLNtoIYT | net_info_flow | |
|---|---|---|---|---|
| 0 | 2008 | 0.038783 | 0.079073 | -0.040289 |
| 1 | 2009 | 0.078221 | 0.000000 | 0.078221 |
| 2 | 2010 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 2011 | 0.000000 | 0.000000 | 0.000000 |
| 4 | 2012 | 0.032854 | 0.015767 | 0.017087 |
| 5 | 2013 | 0.000000 | 0.011814 | -0.011814 |
| 6 | 2014 | 0.000000 | 0.000000 | 0.000000 |
| 7 | 2015 | 0.000000 | 0.000000 | 0.000000 |
| 8 | 2016 | 0.004040 | 0.001815 | 0.002226 |
| 9 | 2017 | 0.029084 | 0.015384 | 0.013700 |
| 10 | 2018 | 0.000000 | 0.000000 | 0.000000 |
| 11 | 2019 | 0.000000 | 0.000000 | 0.000000 |
| 12 | 2020 | 0.000000 | 0.000000 | 0.000000 |
# Load Delta Airlines (DAL) and Exxon Mobil (XOM) daily data, normalize the
# frames, and join their daily returns on date.
delta = pd.read_csv('data/DAL.csv')
exxon = pd.read_csv('data/XOM.csv')
for frame in [delta, exxon]:
    # snake_case column names, parsed dates, and a daily-return series
    frame.columns = [col.lower().replace(' ', '_') for col in frame.columns]
    frame['date'] = pd.to_datetime(frame['date'])
    frame['daily_return'] = daily_return(frame)
cols = ['date', 'daily_return']
df = pd.merge(delta[cols], exxon[cols], on='date', how='inner')
df = df.iloc[1:, :]  # skip the first row: no prior day to compute a return from
df.columns = ['date', 'Delta', 'Exxon']
df
| date | Delta | Exxon | |
|---|---|---|---|
| 1 | 2007-05-07 | -0.029997 | 0.003470 |
| 2 | 2007-05-08 | -0.036514 | 0.006781 |
| 3 | 2007-05-09 | 0.008118 | 0.000491 |
| 4 | 2007-05-10 | -0.004558 | -0.020940 |
| 5 | 2007-05-11 | -0.020514 | 0.022912 |
| ... | ... | ... | ... |
| 3394 | 2020-10-26 | -0.062815 | -0.023998 |
| 3395 | 2020-10-27 | -0.038958 | -0.016020 |
| 3396 | 2020-10-28 | -0.035126 | -0.038831 |
| 3397 | 2020-10-29 | 0.036753 | 0.043391 |
| 3398 | 2020-10-30 | -0.003909 | -0.010673 |
3398 rows × 3 columns
# Yearly bidirectional TE between two single stocks: Exxon Mobil and Delta.
selected = ['Exxon', 'Delta']
cols = ['year', f'{selected[0]}To{selected[1]}', f'{selected[1]}To{selected[0]}']
K = 4  # nearest neighbors for the Kraskov estimator
yearly_stats = []
for yr in range(2007, 2021):
    t_data = df.loc[df['date'].dt.year == yr, selected].dropna()
    if len(t_data) > K:  # need more samples than neighbors to estimate
        fwd = calc.transfer_entropy(src=t_data[selected[0]].values,
                                    dest=t_data[selected[1]].values,
                                    nearest_neighbors=K)
        rev = calc.transfer_entropy(src=t_data[selected[1]].values,
                                    dest=t_data[selected[0]].values,
                                    nearest_neighbors=K)
        yearly_stats.append([yr, fwd, rev])
yearly = pd.DataFrame(yearly_stats, columns=cols)
# Clamp noisy negative TE estimates to zero; `f` is also reused further below.
f = lambda x: max(0, x)
yearly[cols[1]] = yearly[cols[1]].apply(f)
yearly[cols[2]] = yearly[cols[2]].apply(f)
yearly['net_info_flow'] = yearly[cols[1]] - yearly[cols[2]]
print(f'Net Information Flow {selected[0]} -> {selected[1]}')
yearly.style.bar(subset=['net_info_flow'], align='mid', color=['#d65f5f', '#5fba7d'])
Net Information Flow Exxon -> Delta
| year | ExxonToDelta | DeltaToExxon | net_info_flow | |
|---|---|---|---|---|
| 0 | 2007 | 0.000000 | 0.027230 | -0.027230 |
| 1 | 2008 | 0.000000 | 0.015746 | -0.015746 |
| 2 | 2009 | 0.030754 | 0.055017 | -0.024263 |
| 3 | 2010 | 0.015983 | 0.000000 | 0.015983 |
| 4 | 2011 | 0.004166 | 0.000000 | 0.004166 |
| 5 | 2012 | 0.000000 | 0.000000 | 0.000000 |
| 6 | 2013 | 0.000000 | 0.000000 | 0.000000 |
| 7 | 2014 | 0.000000 | 0.000000 | 0.000000 |
| 8 | 2015 | 0.000000 | 0.032663 | -0.032663 |
| 9 | 2016 | 0.049049 | 0.003398 | 0.045651 |
| 10 | 2017 | 0.000000 | 0.000000 | 0.000000 |
| 11 | 2018 | 0.000000 | 0.011026 | -0.011026 |
| 12 | 2019 | 0.000000 | 0.045428 | -0.045428 |
| 13 | 2020 | 0.000000 | 0.033590 | -0.033590 |
# Re-display the styled net-flow table (same output as the previous cell).
bar_colors = ['#d65f5f', '#5fba7d']  # red = negative, green = positive
print(f'Net Information Flow {selected[0]} -> {selected[1]}')
yearly.style.bar(subset=['net_info_flow'], align='mid', color=bar_colors)
Net Information Flow Exxon -> Delta
| year | ExxonToDelta | DeltaToExxon | net_info_flow | |
|---|---|---|---|---|
| 0 | 2007 | 0.000000 | 0.027230 | -0.027230 |
| 1 | 2008 | 0.000000 | 0.015746 | -0.015746 |
| 2 | 2009 | 0.030754 | 0.055017 | -0.024263 |
| 3 | 2010 | 0.015983 | 0.000000 | 0.015983 |
| 4 | 2011 | 0.004166 | 0.000000 | 0.004166 |
| 5 | 2012 | 0.000000 | 0.000000 | 0.000000 |
| 6 | 2013 | 0.000000 | 0.000000 | 0.000000 |
| 7 | 2014 | 0.000000 | 0.000000 | 0.000000 |
| 8 | 2015 | 0.000000 | 0.032663 | -0.032663 |
| 9 | 2016 | 0.049049 | 0.003398 | 0.045651 |
| 10 | 2017 | 0.000000 | 0.000000 | 0.000000 |
| 11 | 2018 | 0.000000 | 0.011026 | -0.011026 |
| 12 | 2019 | 0.000000 | 0.045428 | -0.045428 |
| 13 | 2020 | 0.000000 | 0.033590 | -0.033590 |
# Final charts for the Exxon <-> Delta pair.
px.line(yearly, x='year', y=cols[1:], title=f'Transfer Entropy between {selected[0]} & {selected[1]}', height=600, width=1000)
# Both TE columns were already clamped to >= 0 when `yearly` was built, so
# re-applying the clip lambda here was a no-op; compute the net flow directly
# (this also removes the fragile dependence on the earlier `f` lambda).
yearly['net_info_flow'] = yearly[cols[1]] - yearly[cols[2]]
px.bar(yearly, x='year', y='net_info_flow', title=f'Net Information Flow: {selected[0]} -> {selected[1]}', width=1000, height=600)
px.line(df, x='date', y=['Delta', 'Exxon'], title='Daily return of Exxon and Delta Airlines', width=1000, height=600)
Articles on Exxon Mobil from 2016: Link
